# This file recreates final_dat.csv from the source data.

library(readr); library(haven)
library(WDI)
library(countrycode)
library(assertthat)

# NOTE(review): every relative path below is resolved against this hard-coded,
# user-specific working directory.
setwd(dir = "~/Dropbox/ehrfruhag_accesspoints/Data Analysis")

## ---- Import source data -----------------------------------------------------

# Delimited text files read with base R.
access_points <- read.csv(file = "./Source Data/access_points.csv")
gdp_pop <- read.delim(file = "./Source Data/gdpv6.txt")
hr_scores <- read.csv(
  file = "./Source Data/HumanRightsProtectionScores_v2.04.csv"
)
imf_con <- read.csv(file = "./Source Data/imf_con.csv")
oil <- read.csv(file = "./Source Data/Ross-Mahdavi Oil and Gas 1932-2014.csv")
conflict <- read.csv(file = "./Source Data/conflict.csv")
untreaties <- read.csv(file = "./Source Data/untreaties.csv")
workerr_latent <- read.csv(file = "./Source Data/workerr_latent.csv")

# Files read with readr (CSV) and haven (Stata .dta).
V.Dem <- read_csv(file = "./Source Data/V-Dem-CY+Others-v8.csv")
YouthBulges <- read_dta(file = "./Source Data/YouthBulges_Urdal_ISQ_Posted.dta")

# Two indicators requested through the WDI package for 1960-2003; they are
# renamed to `trade` and `fdi` further down.
trade_fdi <- WDI(
  indicator = c("NE.TRD.GNFS.ZS", "BX.KLT.DREM.CD.DT"),
  start = 1960,
  end = 2003
)

## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------#
# Check for duplicates, make a COW-code conversion for merging, and keep only the variables of interest
# Each source table must hold at most one row per (ccode, year) pair, because
# those two columns are the merge keys used later. anyDuplicated() returns 0
# when there is no duplicate, so its negation is TRUE exactly in that case.

# --- access_points ---
assert_that(!anyDuplicated(access_points[c("ccode", "year")]))

# --- gdp_pop: label the columns, keep the keys plus two variables ---
names(gdp_pop) <- c(
  "ccode", "is03c", "year", "population",
  "realgdp", "realgdppc1995", "realgdppccurrent", "origin"
)
gdp_pop <- gdp_pop[c("ccode", "year", "population", "realgdppccurrent")]
assert_that(!anyDuplicated(gdp_pop[c("ccode", "year")]))

# --- hr_scores: label the columns, keep the keys plus latentmean ---
names(hr_scores) <- c(
  "year", "ciri.name", "ccode", "disap", "kill", "polpris", "tort",
  "amnesty", "pts", "hathaway", "itt", "genocide", "rummel",
  "massive_repression", "executions", "killings", "ciri",
  "latentmean", "latentsd"
)
hr_scores <- hr_scores[c("year", "ccode", "latentmean")]
assert_that(!anyDuplicated(hr_scores[c("ccode", "year")]))

# --- imf_con ---
assert_that(!anyDuplicated(imf_con[c("ccode", "year")]))

# --- oil: drop the listed country labels, then derive a COW numeric code ---
oil <- oil[c("cty_name", "year", "oil_gas_valuePOP_2014")]
oil <- oil[
  !(oil$cty_name %in% c(
    "Ethiopia including Eritrea",
    "France including Algeria",
    "Serbia",
    "Serbia and Montenegro",
    "North Vietnam",
    "South Sudan",
    "Sudan including South Sudan",
    "Soviet Union",
    "Pakistan including East Pakistan",
    "Korea, Dem. Rep.",
    "Federal Republic of Germany (West Germany)",
    "South Vietnam"
  )),
]
oil$ccode <- countrycode(
  sourcevar = oil$cty_name,
  origin = "country.name",
  destination = "cown"
)
assert_that(!anyDuplicated(oil[c("ccode", "year")]))

# --- conflict: rows without a country code cannot be merged ---
conflict <- conflict[!is.na(conflict$ccode), ]
assert_that(!anyDuplicated(conflict[c("ccode", "year")]))

# --- untreaties ---
assert_that(!anyDuplicated(untreaties[c("ccode", "year")]))

# --- V.Dem: keep the polity column, years after 1944, and drop ccode 99999 ---
V.Dem <- V.Dem[, c("COWcode", "year", "e_p_polity")]
names(V.Dem) <- c("ccode", "year", "polity")
V.Dem <- V.Dem[V.Dem$year > 1944, ]
V.Dem <- V.Dem[V.Dem$ccode != 99999, ]
assert_that(!anyDuplicated(V.Dem[, c("ccode", "year")]))

# --- YouthBulges: drop the listed entities, then derive a COW numeric code ---
YouthBulges <- YouthBulges[, c("countryname", "year", "ythblgap")]
YouthBulges <- YouthBulges[
  !(YouthBulges$countryname %in% c(
    "Cameroun", "French Polynesia", "Fed of Rhodesia and Nyasaland",
    "French Equatorial Africa", "French West Africa", "Gaza", "Guadeloupe",
    "Guam", "Hong Kong", "Leeward isls", "Macau", "Martinique",
    "Netherlands Antilles", "New Caledonia", "Pacific isls", "Puerto Rico",
    "Reunion", "Ruanda-Urundi", "United Arab Republic",
    "Vietnam, Republic of", "West Indies", "Western Sahara", "Windward isls"
  )),
]
YouthBulges$ccode <- countrycode(
  sourcevar = YouthBulges$countryname,
  origin = "country.name",
  destination = "cown"
)
assert_that(!anyDuplicated(YouthBulges[, c("ccode", "year")]))


# --- trade_fdi: indicators requested from WDI() ---
#
# Rename the two indicator columns by their indicator code, not by position.
# Assigning five names positionally silently mislabels the columns whenever
# WDI() returns them in another order or returns an extra column.
# NOTE(review): newer WDI releases reportedly add `iso3c` and put `country`
# first -- confirm against the installed version.
names(trade_fdi)[names(trade_fdi) == "NE.TRD.GNFS.ZS"] <- "trade"
names(trade_fdi)[names(trade_fdi) == "BX.KLT.DREM.CD.DT"] <- "fdi"

# Fail loudly if an identifier column or a requested indicator is missing.
assert_that(
  all(c("iso2c", "country", "year", "trade", "fdi") %in% names(trade_fdi))
)

# Keep exactly the five columns (in the order) the positional rename used to
# produce, so downstream code sees the same table.
trade_fdi <- trade_fdi[, c("iso2c", "country", "year", "trade", "fdi")]

# Restrict to the listed ISO2 codes ("NA" here is a literal two-letter code
# string, not a missing value).
trade_fdi <- trade_fdi[
  trade_fdi$iso2c %in% c(
    "AF", "AL", "DZ", "AD", "AO", "AG", "AR", "AM", "AU", "AT",
    "AZ", "BS", "BH", "BD", "BB", "BY", "BE", "BZ", "BJ", "BT",
    "BO", "BA", "BW", "BR", "BN", "BG", "BF", "BI", "CV", "KH",
    "CM", "CA", "CF", "TD", "CL", "CN", "CO", "KM", "CD", "CG",
    "CR", "CI", "HR", "CU", "CY", "CZ", "DK", "DJ", "DM", "DO",
    "EC", "EG", "SV", "GQ", "ER", "EE", "ET", "FJ", "FI", "FR",
    "GA", "GM", "GE", "DE", "GH", "GR", "GD", "GT", "GN", "GW",
    "GY", "HT", "HN", "HU", "IS", "IN", "ID", "IR", "IQ", "IE",
    "IL", "IT", "JM", "JP", "JO", "KZ", "KE", "KI", "KP", "KR",
    "KW", "KG", "LA", "LV", "LB", "LS", "LR", "LY", "LI", "LT",
    "LU", "MK", "MG", "MW", "MY", "MV", "ML", "MT", "MH", "MR",
    "MU", "MX", "FM", "MD", "MC", "MN", "ME", "MA", "MZ", "MM",
    "NA", "NP", "NL", "NZ", "NI", "NE", "NG", "NO", "OM", "PK",
    "PW", "PA", "PG", "PY", "PE", "PH", "PL", "PT", "QA", "RO",
    "RU", "RW", "WS", "SM", "ST", "SA", "SN", "SC", "SL", "SG",
    "SK", "SI", "SB", "SO", "ZA", "SS", "ES", "LK", "KN", "LC",
    "VC", "SD", "SR", "SZ", "SE", "CH", "SY", "TJ", "TZ", "TH",
    "TL", "TG", "TO", "TT", "TN", "TR", "TM", "TV", "UG", "UA",
    "AE", "GB", "US", "UY", "UZ", "VU", "VE", "VN", "YE", "ZM",
    "ZW"
  ),
]

# Rows labelled "Eswatini" are dropped before the name-based code conversion.
trade_fdi <- trade_fdi[!(trade_fdi$country == "Eswatini"), ]
trade_fdi$ccode <- countrycode(trade_fdi$country, "country.name", "cown")
assert_that(!anyDuplicated(trade_fdi[, c("ccode", "year")]))


# --- workerr_latent: keep the keys plus the latent mean ---
workerr_latent <- workerr_latent[, c("ccode", "year", "workerr.mean")]
assert_that(!anyDuplicated(workerr_latent[, c("ccode", "year")]))


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------#
# Set up a balanced data frame (needed later for imputing): one row for every
# combination of a country code found in access_points and a year in 1960-2002.
#
# The grid is derived from the data rather than from hard-coded counts
# (previously 128 countries x 43 years). With fixed counts, any change in the
# set of countries makes data.frame() either fail on mismatched lengths or
# recycle the vectors into a misaligned panel.
panel_ccodes <- unique(access_points$ccode)
panel_years <- seq(1960, 2002, 1)

dat <- expand.grid(
  year = panel_years,
  ccode = panel_ccodes,
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)

# Order rows by country, then year, and put the key columns in merge order.
dat <- dat[order(dat$ccode, dat$year), c("ccode", "year")]
rownames(dat) <- NULL


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------#

# Left-join every source table onto the balanced panel, one after another and
# in this fixed order, matching on (ccode, year). all.x = TRUE keeps every
# panel row; all.y = FALSE discards source rows that have no panel match.
dat <- Reduce(
  function(panel, src) {
    merge(panel, src, by = c("ccode", "year"), all.x = TRUE, all.y = FALSE)
  },
  list(
    access_points,
    gdp_pop,
    conflict,
    hr_scores,
    imf_con,
    oil,
    trade_fdi,
    untreaties,
    V.Dem,
    workerr_latent,
    YouthBulges
  ),
  dat
)

# Keep only the keys and the analysis variables, dropping helper columns such
# as the country-name fields carried in by some sources.
dat <- dat[, c(
  "ccode", "year",
  "govt_parties", "electoral_districts", "federalism", "pooling",
  "presidentialism", "access_points",
  "population", "realgdppccurrent",
  "conflict", "latentmean", "imf_con", "oil_gas_valuePOP_2014",
  "trade", "fdi",
  "cescr_ratify", "cat_ratify",
  "polity", "workerr.mean", "ythblgap"
)]


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------#
# Create variable lags
#
# For every variable `v` listed in `lag_vars`, add a column `l.v` holding,
# within each country (ccode), the value of `v` from the preceding row; the
# first row of each country gets NA. Because the panel has one row per
# country-year, a one-row shift is a one-year lag provided rows are in year
# order, so each country's rows are sorted by year before shifting.
#
# This replaces twenty copy-pasted ddply() calls (one of which computed
# l.cescr_ratify a second time) with a single pass. The new columns are added
# in the same order as before.

# plyr is attached last, so its functions mask same-named dplyr functions.
library(dplyr)
library(tidyverse)
library(plyr)

lag_vars <- c(
  "govt_parties", "electoral_districts", "federalism", "pooling",
  "presidentialism", "access_points", "population", "realgdppccurrent",
  "conflict", "latentmean", "imf_con", "oil_gas_valuePOP_2014", "trade",
  "fdi", "cescr_ratify", "cat_ratify", "polity", "workerr.mean", "ythblgap"
)

dat <- ddply(dat, .(ccode), function(panel) {
  panel <- panel[order(panel$year), , drop = FALSE]
  for (v in lag_vars) {
    current <- panel[[v]]
    # Shift down by one row: NA first, then everything but the last value.
    panel[[paste0("l.", v)]] <- c(NA, current[-length(current)])
  }
  panel
})

# Transform data
# All derived columns carry the `l.` prefix and are built from the lagged
# series created above.
dat$l.log.pop <- log(dat$l.population)
dat$l.log.realgdppccurrent <- log(dat$l.realgdppccurrent)

# Fixed: this used the contemporaneous oil_gas_valuePOP_2014, storing an
# unlagged value under a lagged name. The +1 keeps zero values finite.
dat$l.log.oil <- log(dat$l.oil_gas_valuePOP_2014 + 1)

# Rescale lagged FDI by one million.
dat$l.fdi.permillion <- dat$l.fdi / 1000000

# Treat the values -88, -77 and -66 in lagged polity as missing.
# NOTE(review): presumably Polity's special non-score codes -- confirm against
# the codebook. The unlagged `polity` column is left as is.
dat$l.polity[dat$l.polity %in% c(-88, -77, -66)] <- NA


### Adding the CSO variables and the latent worker-rights standard deviation

# Re-read V-Dem, this time keeping the two CSO columns, with the same
# year / ccode filters as the earlier V-Dem extract.
V.Dem <- read_csv("./Source Data/V-Dem-CY+Others-v8.csv")
V.Dem <- V.Dem[, c("COWcode", "year", "v2csprtcpt", "v2x_cspart")]
colnames(V.Dem) <- c("ccode", "year", "CSOparticipation", "CSO")
V.Dem <- V.Dem[V.Dem$year > 1944, ]
V.Dem <- V.Dem[V.Dem$ccode != 99999, ]
assert_that(!anyDuplicated(V.Dem[, c("ccode", "year")]))

# Re-read the worker-rights file to pick up the standard deviation; the "2"
# suffix avoids a name clash with the workerr.mean column already in `dat`.
workerr_latent <- read.csv("./Source Data/workerr_latent.csv")[
  , c("ccode", "year", "workerr.mean", "workerr.sd")
]
colnames(workerr_latent) <- c("ccode", "year", "workerr.mean2", "workerr.sd2")
assert_that(!anyDuplicated(workerr_latent[, c("ccode", "year")]))

# Fixed: the first merge used `final_dat`, an object this script never creates,
# so it failed in a fresh session and would have discarded the `dat` assembled
# above. Merge onto `dat` instead.
dat <- merge(dat, workerr_latent, by = c("ccode", "year"),
             all.x = TRUE, all.y = FALSE)
dat <- merge(dat, V.Dem, by = c("ccode", "year"),
             all.x = TRUE, all.y = FALSE)

library(dplyr)
library(tidyverse)
library(plyr)

# Within-country lags of the two CSO columns: rows are put in year order, then
# shifted down by one, with NA in each country's first row.
dat <- ddply(dat, .(ccode), function(panel) {
  panel <- panel[order(panel$year), , drop = FALSE]
  for (v in c("CSOparticipation", "CSO")) {
    current <- panel[[v]]
    panel[[paste0("l.", v)]] <- c(NA, current[-length(current)])
  }
  panel
})



#### This is the final_dat used: write the assembled data, without row names
write.csv(x = dat, file = "./final_dat.csv", row.names = FALSE)
